Including Plots

You can also embed plots, for example:

# Print the data set; the tibble preview below shows the first 10 rows.
diabetes_dataset
## # A tibble: 100,000 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    80            0             1 never            25.2         6.6
##  2 Female    54            0             0 No Info          27.3         6.6
##  3 Male      28            0             0 never            27.3         5.7
##  4 Female    36            0             0 current          23.4         5  
##  5 Male      76            1             1 current          20.1         4.8
##  6 Female    20            0             0 never            27.3         6.6
##  7 Female    44            0             0 never            19.3         6.5
##  8 Female    79            0             0 No Info          23.9         5.7
##  9 Male      42            0             0 never            33.6         4.8
## 10 Female    32            0             0 never            27.3         5  
## # ℹ 99,990 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# Male-only subset of the data
male_data <- filter(diabetes_dataset, gender == "Male")
# Female-only subset of the data (printed below)
female_data <- filter(diabetes_dataset, gender == "Female")
female_data
## # A tibble: 58,552 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    80            0             1 never            25.2         6.6
##  2 Female    54            0             0 No Info          27.3         6.6
##  3 Female    36            0             0 current          23.4         5  
##  4 Female    20            0             0 never            27.3         6.6
##  5 Female    44            0             0 never            19.3         6.5
##  6 Female    79            0             0 No Info          23.9         5.7
##  7 Female    32            0             0 never            27.3         5  
##  8 Female    53            0             0 never            27.3         6.1
##  9 Female    54            0             0 former           54.7         6  
## 10 Female    78            0             0 former           36.0         5  
## # ℹ 58,542 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# Count of females in the original dataset with a "normal" A1C
# (HbA1c_level <= 5.7).
# NOTE(review): `<=` includes HbA1c_level == 5.7, which the HbA1c_category
# column shown later in this report labels "Prediabetes 5.7% - 6.4%"
# ("Normal" there is "< 5.7%"). Confirm whether `< 5.7` was intended.

female_data %>% filter(HbA1c_level <= 5.7) %>% tally()
## # A tibble: 1 × 1
##       n
##   <int>
## 1 27397
# Count of males with a "normal" A1C; the `<=` boundary includes
# HbA1c_level == 5.7.
male_data %>% filter(HbA1c_level <= 5.7) %>% tally()
## # A tibble: 1 × 1
##       n
##   <int>
## 1 18865
# People (male and female) with both heart disease and diabetes; the row total
# in the printed tibble header is the count.
diabetes_dataset %>%
  filter(diabetes == 1, heart_disease == 1)
## # A tibble: 1,267 × 9
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Male      67            0             1 not current      27.3         6.5
##  2 Male      57            1             1 not current      27.8         6.6
##  3 Male      80            0             1 former           24.4         7.5
##  4 Male      75            0             1 not current      28.1         7.5
##  5 Male      69            0             1 former           24.1         6.8
##  6 Female    59            0             1 never            60.3         8.8
##  7 Male      80            0             1 former           33.0         6  
##  8 Female    62            1             1 former           44.2         8.2
##  9 Female    62            1             1 never            43.2         8.8
## 10 Female    76            0             1 former           25.7         9  
## # ℹ 1,257 more rows
## # ℹ 2 more variables: blood_glucose_level <dbl>, diabetes <dbl>
# Same filter as above; tally() collapses the matching rows into a single
# count.
diabetes_dataset %>%
  filter(diabetes == 1, heart_disease == 1) %>%
  tally()
## # A tibble: 1 × 1
##       n
##   <int>
## 1  1267
# People with heart disease, grouped by whether bmi >= 30.
# NOTE(review): this prints the matching rows (with an added `bmi >= 30`
# logical column) rather than a count of the bmi >= 30 group; nothing is
# tallied here. Confirm whether a filter on bmi >= 30 plus tally() was meant.
diabetes_dataset %>% group_by(bmi >= 30) %>% filter(heart_disease == 1)
## # A tibble: 3,942 × 10
## # Groups:   bmi >= 30 [2]
##    gender   age hypertension heart_disease smoking_history   bmi HbA1c_level
##    <chr>  <dbl>        <dbl>         <dbl> <chr>           <dbl>       <dbl>
##  1 Female    80            0             1 never            25.2         6.6
##  2 Male      76            1             1 current          20.1         4.8
##  3 Female    72            0             1 former           27.9         6.5
##  4 Male      67            0             1 not current      27.3         6.5
##  5 Female    77            1             1 never            32.0         5  
##  6 Female    59            0             1 ever             23.1         6.5
##  7 Male      68            1             1 current          27.3         5  
##  8 Male      59            0             1 ever             30.8         5  
##  9 Female    80            0             1 never            29.6         5.8
## 10 Male      57            1             1 not current      27.8         6.6
## # ℹ 3,932 more rows
## # ℹ 3 more variables: blood_glucose_level <dbl>, diabetes <dbl>,
## #   `bmi >= 30` <lgl>

Diabetes Dataset

6 rows
gender age hypertension heart_disease smoking_history bmi HbA1c_level blood_glucose_level diabetes
Female 80 0 1 never 25.19 6.6 140 0
Female 54 0 0 No Info 27.32 6.6 80 0
Male 28 0 0 never 27.32 5.7 158 0
Female 36 0 0 current 23.45 5.0 155 0
Male 76 1 1 current 20.14 4.8 155 0
Female 20 0 0 never 27.32 6.6 85 0

Male vs. Female Blood Sugar Levels (HbA1c)

6 rows
gender age hypertension heart_disease smoking_history bmi HbA1c_level blood_glucose_level diabetes HbA1c_category
Female 80 0 1 never 25.19 6.6 140 0 Diabetes ≥ 6.5%
Female 54 0 0 No Info 27.32 6.6 80 0 Diabetes ≥ 6.5%
Male 28 0 0 never 27.32 5.7 158 0 Prediabetes 5.7% - 6.4%
Female 36 0 0 current 23.45 5.0 155 0 Normal < 5.7%
Male 76 1 1 current 20.14 4.8 155 0 Normal < 5.7%
Female 20 0 0 never 27.32 6.6 85 0 Diabetes ≥ 6.5%

Similar Prevalence of Prediabetes – The proportion of individuals categorized as having prediabetes (HbA1c 5.7% - 6.4%) is almost identical between males (41.3%) and females (41.4%). This suggests that prediabetes affects both genders at nearly the same rate.

  • Slightly Higher Diabetes Rates Among Males – More males (21.6%) fall into the diabetes (HbA1c ≥ 6.5%) category compared to females (20.2%). While the difference is small, it might indicate that men have a slightly higher risk of diabetes in this dataset.

Females Have a Slightly Higher Proportion of Normal Blood Sugar Levels – More females (38.4%) fall into the normal blood sugar category (<5.7%) compared to males (37.1%). This may indicate some slight protective factors or lifestyle differences in this group.

Since more males are in the diabetes category, there could be gender-related risk factors worth exploring—such as diet, activity levels, or genetic predisposition.

Overall, blood sugar regulation patterns appear fairly balanced between genders, but small differences suggest potential areas for further investigation.

Similar Prevalence of Prediabetes – The proportion of individuals classified as having prediabetes (HbA1c 5.7% - 6.4%) is nearly identical between males (41.3%) and females (41.4%). This indicates that prediabetes affects both genders at a comparable rate, suggesting no significant disparity.

BMI Distribution by Hypertension Status Plot

Shows the distribution of BMI values based on hypertension status. A violin plot is great for visualizing the distribution and density of BMI across hypertension categories.

Shape and width: The width of each “violin” represents the density of BMI values at different levels. Wider sections mean more individuals have that BMI, while narrower sections indicate fewer people at those values.

Comparison of distributions: The blue violin represents people without hypertension (hypertension = 0), while the red violin represents those with hypertension (hypertension = 1). By comparing them, you can see how BMI differs between these groups.

The horizontal line around 25 BMI: This marks the median BMI for each group. Since both violins have a horizontal line in roughly the same position, it suggests that the median BMI is around 25 for both hypertensive and non-hypertensive individuals.

Density trends: If the violins have different thicknesses in certain BMI ranges, it tells you which BMI values are more or less common in each group. People with hypertension seem to have a higher BMI overall, but both groups share a similar median.

The distribution shape is different—for example, if one violin is wider at higher BMI values, it suggests that hypertension is more common among individuals with higher BMI.

Outliers or extreme values might appear as small bulges or extended tails at the ends of the violins, showing individuals with very high or low BMI.

The proper way to read the following chart is to notice the “thickness” of each distribution. Where the plot expands wider, there are more people within that BMI range. Notice that in the plot for people without hypertension, there is a “wider” range of people who have a lower BMI. This trend is mirrored for people with hypertension, as there’s a wider range of people who have a higher BMI.

Here I’ll leave extra info for you guys regarding the gender column of the original data set

# Females: 58,552 rows, i.e. 17,122 more females than males in this data set
diabetes_dataset %>%
  filter(gender == "Female") %>%
  tally()
## # A tibble: 1 × 1
##       n
##   <int>
## 1 58552
# Males: 41,430 rows
diabetes_dataset %>%
  filter(gender == "Male") %>%
  tally()
## # A tibble: 1 × 1
##       n
##   <int>
## 1 41430
# Gender recorded as "Other": 18 rows
diabetes_dataset %>%
  filter(gender == "Other") %>%
  tally()
## # A tibble: 1 × 1
##       n
##   <int>
## 1    18

One of the more interesting bits of data is that there are individuals that have an a1c of over 6.5 yet are not considered diabetic.

Smokers go brrr

In the smoking data there are 6 unique values

  1. Never: Has Never smoked
  2. Not current: Has smoked but is not currently smoking
  3. Former: Has quit smoking (has abstained for longer than a threshold period, not specified here)
  4. Current: Is currently a smoker
  5. Ever: Has ever smoked regardless of current smoking status
  6. No Info: No smoking history information available

The total number of people who fall into each category is as follows:

  1. Never: 35095
  2. Not current: 6447
  3. Former: 9352
  4. Current: 9286
  5. Ever: 4004
  6. No Info: 35816

There is quite a sizable number of people in the ‘No Info’ category.

The total number of people in the dataset is 100,000. To help clean up the data, we can filter the ‘No Info’ people out, which leaves 64,184 people.

# List the distinct smoking_history values present in the data
diabetes_dataset$smoking_history %>% unique()
## [1] "never"       "No Info"     "current"     "former"      "ever"       
## [6] "not current"
# Number of people in each smoking_history category
diabetes_dataset %>%
  group_by(smoking_history) %>%
  tally(name = "total_people")
## # A tibble: 6 × 2
##   smoking_history total_people
##   <chr>                  <int>
## 1 No Info                35816
## 2 current                 9286
## 3 ever                    4004
## 4 former                  9352
## 5 never                  35095
## 6 not current             6447
# Counts per (smoking_history, diabetes) pair, excluding rows whose
# smoking_history is 'No Info'.
# summarise() is called without `.groups`, so the result stays grouped by
# smoking_history and dplyr prints the message shown below.
smoking_diabetes_dataset <- diabetes_dataset %>% 
  filter(smoking_history != 'No Info') %>%  
  group_by(smoking_history, diabetes) %>%  
  summarise(total = n())
## `summarise()` has grouped output by 'smoking_history'. You can override using
## the `.groups` argument.

library(dplyr)
library(ggplot2)

# Diabetic head-counts per sex at the two BMI extremes used in this chart:
# bmi >= 30 ("Obese") and bmi <= 19 ("Underweight").
obese_men <- nrow(filter(male_data, bmi >= 30, diabetes == 1))
underweight_men <- nrow(filter(male_data, bmi <= 19, diabetes == 1))
obese_women <- nrow(filter(female_data, bmi >= 30, diabetes == 1))
underweight_women <- nrow(filter(female_data, bmi <= 19, diabetes == 1))

# One row per (category, sex) pair. Percentage is each sex's share of the
# diabetic count within its BMI category, so the two bars of a category sum
# to 100%.
plot_data <- data.frame(
  Category = c("Obese", "Obese", "Underweight", "Underweight"),
  Sex = rep(c("Men", "Women"), times = 2),
  Count = c(obese_men, obese_women, underweight_men, underweight_women)
) %>%
  group_by(Category) %>%
  mutate(Percentage = Count / sum(Count) * 100)

# Side-by-side bars of the raw counts, each annotated with its within-category
# percentage just above the bar.
bar_label <- aes(label = paste0(round(Percentage, 1), "%"))

ggplot(plot_data, aes(x = Category, y = Count, fill = Sex)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(
    bar_label,
    position = position_dodge(width = 0.9),
    vjust = -0.5,
    size = 4
  ) +
  labs(
    title = "Diabetic Individuals by Weight Category and Sex",
    y = "Count",
    x = "BMI Category"
  ) +
  scale_fill_manual(values = c("Men" = "#1f77b4", "Women" = "#ff7f0e")) +
  theme_minimal()

library(dplyr)
library(ggplot2)

# Data prep: heart-disease cases at the two BMI extremes, split by gender.
# bmi >= 30 is labelled "Obese" so it agrees with the obese_* counts (same
# bmi >= 30 cutoff) used for the bar chart in this report; on the standard
# adult BMI scale "overweight" is 25-29.9 and "obese" is >= 30.
bmi_heart_gender <- diabetes_dataset %>%
  filter(heart_disease == 1) %>%
  mutate(
    bmi_category = case_when(
      bmi <= 19 ~ "Underweight",
      bmi >= 30 ~ "Obese",
      TRUE ~ NA_character_ # everything in between is dropped on the next step
    )
  ) %>%
  filter(!is.na(bmi_category)) %>%
  group_by(gender, bmi_category) %>%
  summarise(count = n(), .groups = "drop") %>%
  # percentage = each category's share of that gender's rows in this table
  group_by(gender) %>%
  mutate(percentage = round(100 * count / sum(count), 1)) %>%
  # Drop the grouping so later steps do not silently operate per gender
  ungroup() %>%
  # Fix the x-axis order (Underweight left, Obese right) so the line connects
  # the categories in a consistent direction
  mutate(
    bmi_category = factor(bmi_category, levels = c("Underweight", "Obese"))
  )

# Dot + line plot: one line per gender, each point labelled "count (pct%)"
ggplot(
  bmi_heart_gender,
  aes(x = bmi_category, y = count, group = gender, color = gender)
) +
  geom_line(position = position_dodge(width = 0.5), linewidth = 1) +
  geom_point(size = 5, position = position_dodge(width = 0.5)) +
  geom_text(
    aes(label = paste0(count, " (", percentage, "%)")),
    vjust = -1,
    position = position_dodge(width = 0.5)
  ) +
  labs(
    title = "Heart Disease Cases by Gender and BMI Category",
    x = "BMI Category",
    y = "Count of People with Heart Disease",
    color = "Gender"
  ) +
  theme_minimal()

library(dplyr)
library(tidyr)
library(plotly)

# Data prep: heart-disease cases at the two BMI extremes, split by gender.
# bmi >= 30 is labelled "Obese" so it agrees with the obese_* counts (same
# bmi >= 30 cutoff) used for the bar chart in this report; on the standard
# adult BMI scale "overweight" is 25-29.9 and "obese" is >= 30.
bmi_heart_gender <- diabetes_dataset %>%
  filter(heart_disease == 1) %>%
  mutate(
    bmi_category = case_when(
      bmi <= 19 ~ "Underweight",
      bmi >= 30 ~ "Obese",
      TRUE ~ NA_character_ # everything in between is dropped on the next step
    )
  ) %>%
  filter(!is.na(bmi_category)) %>%
  group_by(gender, bmi_category) %>%
  summarise(count = n(), .groups = "drop") %>%
  # percentage = each category's share of that gender's rows in this table
  group_by(gender) %>%
  mutate(percentage = round(100 * count / sum(count), 1)) %>%
  ungroup() %>%
  mutate(
    # Factor levels fix the category order; x is the numeric axis position
    # (1 = Underweight, 2 = Obese)
    bmi_category = factor(bmi_category, levels = c("Underweight", "Obese")),
    x = as.numeric(bmi_category)
  ) %>%
  # Explicit ordering so each gender's line runs left to right and the
  # endpoint lookup below does not depend on incidental row order
  arrange(gender, x)

# Spread into wide format (one row per BMI category, one column per gender)
# for the line intersection math
wide <- bmi_heart_gender %>%
  select(bmi_category, x, gender, count) %>%
  pivot_wider(names_from = gender, values_from = count) %>%
  arrange(x)

# Endpoints of the Male and Female segments between the two BMI categories
x1 <- wide$x[1]; x2 <- wide$x[2]
y_male1 <- wide$Male[1]; y_male2 <- wide$Male[2]
y_fem1 <- wide$Female[1]; y_fem2 <- wide$Female[2]

# Solve for intersection of two lines
# Line 1: Male   = y = m1*x + b1
# Line 2: Female = y = m2*x + b2
m1 <- (y_male2 - y_male1) / (x2 - x1)
m2 <- (y_fem2 - y_fem1) / (x2 - x1)
b1 <- y_male1 - m1 * x1
b2 <- y_fem1 - m2 * x1
x_int <- (b2 - b1) / (m1 - m2)
y_int <- m1 * x_int + b1

# The drawn segments only cross if the solution exists (non-parallel lines, no
# missing endpoint) and lies between the two plotted categories. Otherwise
# x_int is NaN/Inf/NA/empty or an extrapolation beyond the data, and must not
# be shown as an intersection.
has_intersection <- length(x_int) == 1 &&
  is.finite(x_int) &&
  x_int >= min(x1, x2) &&
  x_int <= max(x1, x2)

# Interactive Plot
fig <- plot_ly()

# Add one line per gender, with count and percentage as point labels
fig <- fig %>%
  add_trace(data = bmi_heart_gender,
            x = ~x, y = ~count, type = 'scatter', mode = 'lines+markers+text',
            color = ~gender, text = ~paste("Count:", count, "<br>%:", percentage),
            textposition = "top center", name = ~gender)

# Add intersection point, only when the two segments actually cross
if (has_intersection) {
  fig <- fig %>%
    add_trace(x = c(x_int), y = c(y_int),
              type = 'scatter', mode = 'markers+text',
              marker = list(size = 12, color = 'black', symbol = "x"),
              name = "Intersection",
              text = paste0("Intersection<br>x: ", round(x_int, 2), "<br>y: ", round(y_int, 2)),
              textposition = "bottom center",
              hoverinfo = "text")
}

# Map the numeric x positions back to their category names on the axis
fig <- fig %>%
  layout(
    title = "Interactive Plot: Heart Disease Counts by Gender and BMI Category",
    xaxis = list(title = "BMI Category", tickvals = c(1, 2), ticktext = c("Underweight", "Obese")),
    yaxis = list(title = "Count"),
    showlegend = TRUE
  )

fig
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels